# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_regression
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import warnings
warnings.filterwarnings('ignore')

# Set style for plots
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Load the dataset
df=pd.read_csv('concrete.csv')

print("Dataset shape:", df.shape)
print("\nFirst 5 rows:")
display(df.head())

Dataset shape: (1030, 9)

First 5 rows:

# Basic statistics
print("\nDescriptive statistics:")
display(df.describe().T)

Descriptive statistics:

# Check for missing values
print("\nMissing values:")
print(df.isnull().sum())

Missing values:
cement          0
slag            0
ash             0
water           0
superplastic    0
coarseagg       0
fineagg         0
age             0
strength        0
dtype: int64

# Check data types
print("\nData types:")
print(df.dtypes)

Data types:
cement          float64
slag            float64
ash             float64
water           float64
superplastic    float64
coarseagg       float64
fineagg         float64
age               int64
strength        float64
dtype: object

# Histograms of all features
df.hist(figsize=(15, 12), bins=20)
plt.tight_layout()
plt.show()

# Boxplots for all features
plt.figure(figsize=(15, 8))
df.boxplot()
plt.xticks(rotation=45)
plt.title("Boxplots of Features")
plt.show()

# Correlation matrix
plt.figure(figsize=(12, 10))
corr = df.corr()
sns.heatmap(corr, annot=True, cmap='coolwarm', center=0)
plt.title("Correlation Matrix")
plt.show()

# Pairplot of selected features
sns.pairplot(df[['cement', 'water', 'age', 'superplastic', 'strength']])
plt.show()

# Strength distribution
plt.figure(figsize=(10, 6))
sns.histplot(df['strength'], kde=True)
plt.title("Distribution of Concrete Compressive Strength")
plt.xlabel("Strength (MPa)")
plt.show()

# Relationship between age and strength
plt.figure(figsize=(12, 6))
sns.scatterplot(x='age', y='strength', data=df)
plt.title("Strength vs Age")
plt.show()

# Relationship between cement and strength
plt.figure(figsize=(12, 6))
sns.scatterplot(x='cement', y='strength', data=df)
plt.title("Strength vs Cement Content")
plt.show()

# Relationship between water and strength
plt.figure(figsize=(12, 6))
sns.scatterplot(x='water', y='strength', data=df)
plt.title("Strength vs Water Content")
plt.show()

# Create new features
df['water_cement_ratio'] = df['water'] / df['cement']
df['aggregate_ratio'] = df['fineagg'] / df['coarseagg']
df['total_binder'] = df['cement'] + df['slag'] + df['ash']
df['total_aggregate'] = df['coarseagg'] + df['fineagg']

# Log transform age to handle skewness
df['log_age'] = np.log1p(df['age'])

# Check new features
print(df[['water_cement_ratio', 'aggregate_ratio', 'total_binder', 'total_aggregate', 'log_age']].describe())

       water_cement_ratio  aggregate_ratio  total_binder  total_aggregate  \
count         1030.000000      1030.000000   1030.000000      1030.000000   
mean             0.748266         0.801453    409.252039      1746.499417   
std              0.314005         0.115049     92.780669       101.235195   
min              0.266893         0.533369    200.000000      1457.000000   
25%              0.533333         0.736298    336.425000      1679.000000   
50%              0.675349         0.789855    391.300000      1752.900000   
75%              0.935165         0.891673    483.700000      1827.950000   
max              1.882353         1.164887    640.000000      1970.000000   

           log_age  
count  1030.000000  
mean      3.242209  
std       1.110431  
min       0.693147  
25%       2.079442  
50%       3.367296  
75%       4.043051  
max       5.902633

# Visualize new features
plt.figure(figsize=(15, 10))
plt.subplot(2, 2, 1)
sns.scatterplot(x='water_cement_ratio', y='strength', data=df)
plt.title("Strength vs Water-Cement Ratio")

plt.subplot(2, 2, 2)
sns.scatterplot(x='aggregate_ratio', y='strength', data=df)
plt.title("Strength vs Aggregate Ratio")

plt.subplot(2, 2, 3)
sns.scatterplot(x='total_binder', y='strength', data=df)
plt.title("Strength vs Total Binder Content")

plt.subplot(2, 2, 4)
sns.scatterplot(x='log_age', y='strength', data=df)
plt.title("Strength vs Log(Age)")

plt.tight_layout()
plt.show()

# Define features and target
X = df.drop('strength', axis=1)
y = df['strength']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create preprocessing pipeline
preprocessor = Pipeline([
    ('scaler', StandardScaler()),
    ('transformer', PowerTransformer(method='yeo-johnson')),
    ('selector', SelectKBest(score_func=f_regression, k=10))
])

# Fit and transform the training data
X_train_preprocessed = preprocessor.fit_transform(X_train, y_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Get selected features
selected_features = preprocessor.named_steps['selector'].get_support()
print("Selected features:", X.columns[selected_features])

Selected features: Index(['cement', 'slag', 'water', 'superplastic', 'coarseagg', 'age',
       'water_cement_ratio', 'total_binder', 'total_aggregate', 'log_age'],
      dtype='object')

# Define models to evaluate
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42),
    'LightGBM': LGBMRegressor(random_state=42)
}

# Evaluate each model
results = {}
for name, model in models.items():
    # Fit the model
    model.fit(X_train_preprocessed, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000249 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1300
[LightGBM] [Info] Number of data points in the train set: 824, number of used features: 10
[LightGBM] [Info] Start training from score 35.765570
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf

# Make predictions
y_pred = model.predict(X_test_preprocessed)

# Calculate metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Store results
results[name] = {
'RMSE': rmse,
'R2 Score': r2
}

# Print results
print(f"{name}:")
print(f"  RMSE: {rmse:.4f}")
print(f"  R2 Score: {r2:.4f}")
print()

LightGBM:
  RMSE: 4.7674
  R2 Score: 0.9206

# Convert results to DataFrame
results_df = pd.DataFrame(results).T
results_df = results_df.sort_values('RMSE')
display(results_df)

# Select best model based on initial results
best_model_name = results_df.index[0]
print(f"Best model from initial evaluation: {best_model_name}")

Best model from initial evaluation: LightGBM

# Define parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
    }

# Initialize GridSearchCV
grid_search = GridSearchCV(
    estimator=GradientBoostingRegressor(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1
)

# Fit the grid search
grid_search.fit(X_train_preprocessed, y_train)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits

GridSearchCV(cv=5, estimator=GradientBoostingRegressor(random_state=42),
             n_jobs=-1,
             param_grid={'learning_rate': [0.01, 0.05, 0.1],
                         'max_depth': [3, 5, 7], 'min_samples_leaf': [1, 2, 4],
                         'min_samples_split': [2, 5, 10],
                         'n_estimators': [100, 200, 300]},
             scoring='neg_mean_squared_error', verbose=1)

GridSearchCV(cv=5, estimator=GradientBoostingRegressor(random_state=42),
             n_jobs=-1,
             param_grid={'learning_rate': [0.01, 0.05, 0.1],
                         'max_depth': [3, 5, 7], 'min_samples_leaf': [1, 2, 4],
                         'min_samples_split': [2, 5, 10],
                         'n_estimators': [100, 200, 300]},
             scoring='neg_mean_squared_error', verbose=1)

GradientBoostingRegressor(max_depth=5, min_samples_leaf=2, min_samples_split=10,
                          n_estimators=300, random_state=42)

GradientBoostingRegressor(max_depth=5, min_samples_leaf=2, min_samples_split=10,
                          n_estimators=300, random_state=42)

# Get best estimator and parameters
best_model = grid_search.best_estimator_
print("\nBest parameters:", grid_search.best_params_)

Best parameters: {'learning_rate': 0.1, 'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 300}

# Evaluate best model
y_pred_best = best_model.predict(X_test_preprocessed)
rmse_best = np.sqrt(mean_squared_error(y_test, y_pred_best))
r2_best = r2_score(y_test, y_pred_best)

print(f"\nBest model performance:")
print(f"  RMSE: {rmse_best:.4f}")
print(f"  R2 Score: {r2_best:.4f}")

Best model performance:
  RMSE: 5.0804
  R2 Score: 0.9098

# Get feature importance from the best model
feature_importance = best_model.feature_importances_

# Create a DataFrame for visualization
features_df = pd.DataFrame({
    'Feature': X.columns[selected_features],
    'Importance': feature_importance
}).sort_values('Importance', ascending=False)

# Plot feature importance
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=features_df)
plt.title("Feature Importance from Gradient Boosting Model")
plt.tight_layout()
plt.show()

# Calculate residuals
residuals = y_test - y_pred_best

# Plot residuals
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
sns.scatterplot(x=y_pred_best, y=residuals)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel("Predicted Values")
plt.ylabel("Residuals")
plt.title("Residuals vs Predicted Values")

plt.subplot(1, 2, 2)
sns.histplot(residuals, kde=True)
plt.title("Distribution of Residuals")

plt.tight_layout()
plt.show()

# Cross-validation scores
cv_scores = cross_val_score(
    best_model,
    preprocessor.transform(X),
    y,
    cv=5,
    scoring='neg_mean_squared_error'
)

# Calculate RMSE from cross-validation
cv_rmse = np.sqrt(-cv_scores.mean())
print(f"Cross-validated RMSE: {cv_rmse:.4f}")

Cross-validated RMSE: 4.1944

# Actual vs Predicted plot
plt.figure(figsize=(8, 8))
plt.scatter(y_test, y_pred_best, alpha=0.5)
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'r--')
plt.xlabel("Actual Strength")
plt.ylabel("Predicted Strength")
plt.title("Actual vs Predicted Strength")
plt.show()

# Install shap if not available
try:
    import shap
except:
    !pip install shap
    import shap

# Initialize SHAP explainer
explainer = shap.TreeExplainer(best_model)
shap_values = explainer.shap_values(X_test_preprocessed)

# Summary plot
plt.figure()
shap.summary_plot(shap_values, X_test_preprocessed, feature_names=X.columns[selected_features])
plt.show()

	cement	slag	ash	water	superplastic	coarseagg	fineagg	age	strength
0	141.3	212.0	0.0	203.5	0.0	971.8	748.5	28	29.89
1	168.9	42.2	124.3	158.3	10.8	1080.8	796.2	14	23.51
2	250.0	0.0	95.7	187.4	5.5	956.9	861.2	28	29.22
3	266.0	114.0	0.0	228.0	0.0	932.0	670.0	28	45.85
4	154.8	183.4	0.0	193.3	9.1	1047.4	696.7	28	18.29

	count	mean	std	min	25%	50%	75%	max
cement	1030.0	281.167864	104.506364	102.00	192.375	272.900	350.000	540.0
slag	1030.0	73.895825	86.279342	0.00	0.000	22.000	142.950	359.4
ash	1030.0	54.188350	63.997004	0.00	0.000	0.000	118.300	200.1
water	1030.0	181.567282	21.354219	121.80	164.900	185.000	192.000	247.0
superplastic	1030.0	6.204660	5.973841	0.00	0.000	6.400	10.200	32.2
coarseagg	1030.0	972.918932	77.753954	801.00	932.000	968.000	1029.400	1145.0
fineagg	1030.0	773.580485	80.175980	594.00	730.950	779.500	824.000	992.6
age	1030.0	45.662136	63.169912	1.00	7.000	28.000	56.000	365.0
strength	1030.0	35.817961	16.705742	2.33	23.710	34.445	46.135	82.6

Concrete Compressive Strength Prediction¶

1. Data Loading and Initial Exploration¶

2. Exploratory Data Analysis (EDA)¶

3. Feature Engineering¶

4. Data Preprocessing¶

5. Model Building and Evaluation¶

6. Hyperparameter Tuning for Best Model¶

7. Feature Importance Analysis¶

8. Residual Analysis¶

9. Final Model Evaluation¶

10. Model Interpretation with SHAP Values¶

11. Conclusion and Recommendations¶

👨💻 By: Irfan Ullah Khan